1.繪製全球的感染與康復人數,以國家別區分
2.利用地理資訊圖繪製全球感染、康復人數
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
pd.set_option('display.max_rows', None)
from plotly.subplots import make_subplots
import seaborn as sns
import datetime
# Load the COVID-19 dataset from the local CSV export and preview the first rows.
csv_path = 'C:/Users/abcd8/OneDrive/桌面/resume/project5/covid_19_data.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
df.head()
| SNo | ObservationDate | Province/State | Country/Region | Last Update | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 01/22/2020 | Anhui | Mainland China | 1/22/2020 17:00 | 1.0 | 0.0 | 0.0 |
| 1 | 2 | 01/22/2020 | Beijing | Mainland China | 1/22/2020 17:00 | 14.0 | 0.0 | 0.0 |
| 2 | 3 | 01/22/2020 | Chongqing | Mainland China | 1/22/2020 17:00 | 6.0 | 0.0 | 0.0 |
| 3 | 4 | 01/22/2020 | Fujian | Mainland China | 1/22/2020 17:00 | 1.0 | 0.0 | 0.0 |
| 4 | 5 | 01/22/2020 | Gansu | Mainland China | 1/22/2020 17:00 | 0.0 | 0.0 | 0.0 |
# Print the column dtypes and non-null counts of the raw data.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 156292 entries, 0 to 156291 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SNo 156292 non-null int64 1 ObservationDate 156292 non-null object 2 Province/State 111979 non-null object 3 Country/Region 156292 non-null object 4 Last Update 156292 non-null object 5 Confirmed 156292 non-null float64 6 Deaths 156292 non-null float64 7 Recovered 156292 non-null float64 dtypes: float64(3), int64(1), object(4) memory usage: 9.5+ MB
# Show summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()
| SNo | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|
| count | 156292.000000 | 1.562920e+05 | 156292.000000 | 1.562920e+05 |
| mean | 78146.500000 | 2.844340e+04 | 966.090273 | 1.793232e+04 |
| std | 45117.758473 | 8.809800e+04 | 3278.110680 | 9.813018e+04 |
| min | 1.000000 | -3.028440e+05 | -178.000000 | -8.544050e+05 |
| 25% | 39073.750000 | 3.900000e+02 | 5.000000 | 1.000000e+01 |
| 50% | 78146.500000 | 3.410000e+03 | 67.000000 | 7.570000e+02 |
| 75% | 117219.250000 | 1.597875e+04 | 486.000000 | 6.270000e+03 |
| max | 156292.000000 | 1.867721e+06 | 45974.000000 | 4.174884e+06 |
# Generate an automated profiling report for the whole DataFrame.
# NOTE(review): importing pandas_profiling emits a DeprecationWarning that
# recommends `ydata_profiling` instead — consider migrating.
import pandas_profiling
profile = pandas_profiling.ProfileReport(df)
# Save a standalone HTML copy of the report in the working directory.
profile.to_file("output1.html")
# Render the same report inline, first as widgets and then as an iframe.
profile.to_widgets()
profile.to_notebook_iframe()
C:\Users\abcd8\AppData\Local\Temp\ipykernel_8728\2769708720.py:1: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead. import pandas_profiling
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
C:\Users\abcd8\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\discretize_pandas.py:52: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)` discretized_df.loc[:, column] = self._discretize_column(
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
Render widgets: 0%| | 0/1 [00:00<?, ?it/s]
| SNo | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|
| SNo | 1.000000 | 0.426213 | 0.363542 | 0.364149 |
| Confirmed | 0.426213 | 1.000000 | 0.936324 | 0.552633 |
| Deaths | 0.363542 | 0.936324 | 1.000000 | 0.485000 |
| Recovered | 0.364149 | 0.552633 | 0.485000 | 1.000000 |
| SNo | ObservationDate | Province/State | Country/Region | Last Update | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 01/22/2020 | Anhui | Mainland China | 1/22/2020 17:00 | 1.0 | 0.0 | 0.0 |
| 1 | 2 | 01/22/2020 | Beijing | Mainland China | 1/22/2020 17:00 | 14.0 | 0.0 | 0.0 |
| 2 | 3 | 01/22/2020 | Chongqing | Mainland China | 1/22/2020 17:00 | 6.0 | 0.0 | 0.0 |
| 3 | 4 | 01/22/2020 | Fujian | Mainland China | 1/22/2020 17:00 | 1.0 | 0.0 | 0.0 |
| 4 | 5 | 01/22/2020 | Gansu | Mainland China | 1/22/2020 17:00 | 0.0 | 0.0 | 0.0 |
| 5 | 6 | 01/22/2020 | Guangdong | Mainland China | 1/22/2020 17:00 | 26.0 | 0.0 | 0.0 |
| 6 | 7 | 01/22/2020 | Guangxi | Mainland China | 1/22/2020 17:00 | 2.0 | 0.0 | 0.0 |
| 7 | 8 | 01/22/2020 | Guizhou | Mainland China | 1/22/2020 17:00 | 1.0 | 0.0 | 0.0 |
| 8 | 9 | 01/22/2020 | Hainan | Mainland China | 1/22/2020 17:00 | 4.0 | 0.0 | 0.0 |
| 9 | 10 | 01/22/2020 | Hebei | Mainland China | 1/22/2020 17:00 | 1.0 | 0.0 | 0.0 |
| SNo | ObservationDate | Province/State | Country/Region | Last Update | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|---|---|---|---|
| 156282 | 156283 | 11/15/2020 | Yukon | Canada | 2020-11-16 05:25:57 | 24.0 | 1.0 | 22.0 |
| 156283 | 156284 | 11/15/2020 | Yunnan | Mainland China | 2020-11-16 05:25:57 | 217.0 | 2.0 | 209.0 |
| 156284 | 156285 | 11/15/2020 | Zabaykalsky Krai | Russia | 2020-11-16 05:25:57 | 15222.0 | 219.0 | 12262.0 |
| 156285 | 156286 | 11/15/2020 | Zacatecas | Mexico | 2020-11-16 05:25:57 | 13501.0 | 1120.0 | 0.0 |
| 156286 | 156287 | 11/15/2020 | Zakarpattia Oblast | Ukraine | 2020-11-16 05:25:57 | 19384.0 | 468.0 | 8382.0 |
| 156287 | 156288 | 11/15/2020 | Zaporizhia Oblast | Ukraine | 2020-11-16 05:25:57 | 18484.0 | 164.0 | 3021.0 |
| 156288 | 156289 | 11/15/2020 | Zeeland | Netherlands | 2020-11-16 05:25:57 | 5041.0 | 86.0 | 0.0 |
| 156289 | 156290 | 11/15/2020 | Zhejiang | Mainland China | 2020-11-16 05:25:57 | 1291.0 | 1.0 | 1279.0 |
| 156290 | 156291 | 11/15/2020 | Zhytomyr Oblast | Ukraine | 2020-11-16 05:25:57 | 22225.0 | 368.0 | 12266.0 |
| 156291 | 156292 | 11/15/2020 | Zuid-Holland | Netherlands | 2020-11-16 05:25:57 | 129188.0 | 2031.0 | 0.0 |
VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…
# Flag, per column, whether it contains at least one missing value.
print(df.isna().any())
SNo False ObservationDate False Province/State True Country/Region False Last Update False Confirmed False Deaths False Recovered False dtype: bool
# Count the missing values in each column.
df.isna().sum()
SNo 0 ObservationDate 0 Province/State 44313 Country/Region 0 Last Update 0 Confirmed 0 Deaths 0 Recovered 0 dtype: int64
# Summarise the share of missing values per column as a percentage table.
NAN = [
    (column, df[column].isna().mean() * 100)
    for column in df.columns
]
NAN_df = pd.DataFrame(NAN, columns=['column_name', 'percentage'])
NAN_df
| column_name | percentage | |
|---|---|---|
| 0 | SNo | 0.000000 |
| 1 | ObservationDate | 0.000000 |
| 2 | Province/State | 28.352699 |
| 3 | Country/Region | 0.000000 |
| 4 | Last Update | 0.000000 |
| 5 | Confirmed | 0.000000 |
| 6 | Deaths | 0.000000 |
| 7 | Recovered | 0.000000 |
# List the column labels available in the dataset.
df.columns
Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
dtype='object')
# Province/State is the only column fillna touches here: rows without a
# province get the placeholder 'Unknown'. Then re-check for missing values.
df = df.fillna(value={'Province/State': 'Unknown'})
df.isna().any()
SNo False ObservationDate False Province/State False Country/Region False Last Update False Confirmed False Deaths False Recovered False dtype: bool
# Convert the case counters from float to integer, printing the schema before
# and after so the dtype change is visible.
df.info()
# Use an explicit 64-bit integer: plain `int` is platform-dependent and
# produced int32 in the recorded run, which can overflow once cumulative
# counts are summed across many rows.
count_columns = ['Confirmed', 'Deaths', 'Recovered']
df[count_columns] = df[count_columns].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 156292 entries, 0 to 156291 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SNo 156292 non-null int64 1 ObservationDate 156292 non-null object 2 Province/State 156292 non-null object 3 Country/Region 156292 non-null object 4 Last Update 156292 non-null object 5 Confirmed 156292 non-null float64 6 Deaths 156292 non-null float64 7 Recovered 156292 non-null float64 dtypes: float64(3), int64(1), object(4) memory usage: 9.5+ MB <class 'pandas.core.frame.DataFrame'> RangeIndex: 156292 entries, 0 to 156291 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SNo 156292 non-null int64 1 ObservationDate 156292 non-null object 2 Province/State 156292 non-null object 3 Country/Region 156292 non-null object 4 Last Update 156292 non-null object 5 Confirmed 156292 non-null int32 6 Deaths 156292 non-null int32 7 Recovered 156292 non-null int32 dtypes: int32(3), int64(1), object(4) memory usage: 7.8+ MB
# Normalise country labels to their common full names in a single pass.
country_aliases = {
    'Mainland China': 'China',
    'US': 'United States',
    'UK': 'United Kingdom',
}
df['Country/Region'] = df['Country/Region'].replace(country_aliases)
df.head()
| SNo | ObservationDate | Province/State | Country/Region | Last Update | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 01/22/2020 | Anhui | China | 1/22/2020 17:00 | 1 | 0 | 0 |
| 1 | 2 | 01/22/2020 | Beijing | China | 1/22/2020 17:00 | 14 | 0 | 0 |
| 2 | 3 | 01/22/2020 | Chongqing | China | 1/22/2020 17:00 | 6 | 0 | 0 |
| 3 | 4 | 01/22/2020 | Fujian | China | 1/22/2020 17:00 | 1 | 0 | 0 |
| 4 | 5 | 01/22/2020 | Gansu | China | 1/22/2020 17:00 | 0 | 0 | 0 |
# Active cases = confirmed cases that have neither died nor recovered.
closed_cases_removed = df['Confirmed'].sub(df['Deaths']).sub(df['Recovered'])
df['Active_case'] = closed_cases_removed
df.head(5)
| SNo | ObservationDate | Province/State | Country/Region | Last Update | Confirmed | Deaths | Recovered | Active_case | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 01/22/2020 | Anhui | China | 1/22/2020 17:00 | 1 | 0 | 0 | 1 |
| 1 | 2 | 01/22/2020 | Beijing | China | 1/22/2020 17:00 | 14 | 0 | 0 | 14 |
| 2 | 3 | 01/22/2020 | Chongqing | China | 1/22/2020 17:00 | 6 | 0 | 0 | 6 |
| 3 | 4 | 01/22/2020 | Fujian | China | 1/22/2020 17:00 | 1 | 0 | 0 | 1 |
| 4 | 5 | 01/22/2020 | Gansu | China | 1/22/2020 17:00 | 0 | 0 | 0 | 0 |
設定最近的日期
# Keep only the rows recorded on the most recent observation date.
# ObservationDate is stored as MM/DD/YYYY text, so max() on the raw strings
# compares them lexicographically (e.g. '12/31/2020' > '01/15/2021') and would
# pick the wrong day once the data spans more than one year. Compare parsed
# dates instead; the column itself is left untouched as text.
# NOTE(review): assumes every ObservationDate value follows MM/DD/YYYY, as in
# the previewed rows — confirm against the full file.
observation_dates = pd.to_datetime(df['ObservationDate'], format='%m/%d/%Y')
latest_date = observation_dates.max()
new_df = df[observation_dates == latest_date].reset_index(drop=True)
new_df.head(5)
| SNo | ObservationDate | Province/State | Country/Region | Last Update | Confirmed | Deaths | Recovered | Active_case | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 155535 | 11/15/2020 | Unknown | Afghanistan | 2020-11-16 05:25:57 | 43240 | 1617 | 35092 | 6531 |
| 1 | 155536 | 11/15/2020 | Unknown | Albania | 2020-11-16 05:25:57 | 27830 | 623 | 12889 | 14318 |
| 2 | 155537 | 11/15/2020 | Unknown | Algeria | 2020-11-16 05:25:57 | 67679 | 2154 | 44633 | 20892 |
| 3 | 155538 | 11/15/2020 | Unknown | Andorra | 2020-11-16 05:25:57 | 5872 | 76 | 4747 | 1049 |
| 4 | 155539 | 11/15/2020 | Unknown | Angola | 2020-11-16 05:25:57 | 13451 | 322 | 6444 | 6685 |
計算當日全球總數
# Collapse the latest-day snapshot into one row of worldwide totals per date.
metric_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active_case']
df_world = new_df.groupby('ObservationDate', as_index=False)[metric_columns].sum()
df_world.head()
| ObservationDate | Confirmed | Deaths | Recovered | Active_case | |
|---|---|---|---|---|---|
| 0 | 11/15/2020 | 54370186 | 1317139 | 34955148 | 18097899 |
# Render the worldwide totals as a plotly table figure.
import plotly.figure_factory as ff

fig = ff.create_table(df_world)
# Title typo fixed ("word" -> "world"); the top margin leaves room for the
# title above the table. Both layout tweaks are applied in a single call.
fig.update_layout(
    title_text='Coronavirus in the world : ',
    margin={'t': 50},
)
fig.show()
# Aggregate the latest-day snapshot per country, then preview the ten
# countries with the most confirmed cases.
country_metrics = ['Confirmed', 'Deaths', 'Recovered', 'Active_case']
ans = new_df.groupby('Country/Region', as_index=False)[country_metrics].sum()
ans.sort_values(by='Confirmed', ascending=False).head(10)
| Country/Region | Confirmed | Deaths | Recovered | Active_case | |
|---|---|---|---|---|---|
| 182 | United States | 11036935 | 246214 | 4174884 | 6615837 |
| 79 | India | 8845127 | 130070 | 8249579 | 465478 |
| 23 | Brazil | 5863093 | 165798 | 5279452 | 417843 |
| 61 | France | 1915713 | 42601 | 139760 | 1733352 |
| 142 | Russia | 1910149 | 32885 | 1429565 | 447699 |
| 162 | Spain | 1458591 | 40769 | 150376 | 1267446 |
| 181 | United Kingdom | 1372884 | 52026 | 3121 | 1317737 |
| 6 | Argentina | 1310491 | 35436 | 1129102 | 145953 |
| 37 | Colombia | 1198746 | 34031 | 1104956 | 59759 |
| 85 | Italy | 1178529 | 45229 | 420810 | 712490 |
# Bar chart of all four metrics for every country, ordered by confirmed cases.
import plotly.express as px

wide_df = ans.sort_values(by='Confirmed', ascending=False)
fig = px.bar(
    wide_df,
    x="Country/Region",
    y=["Confirmed", "Deaths", "Recovered", "Active_case"],
    title="Covid-19",
    text_auto=True,
)
fig.show()
當日確診前10國家
# Bar chart of the ten countries with the most confirmed cases.
import plotly.express as px

by_confirmed = ans.sort_values(by='Confirmed', ascending=False)
wide_df = by_confirmed.head(10)
fig = px.bar(
    wide_df,
    x="Country/Region",
    y=["Confirmed"],
    title="Confirmed",
    text_auto=True,
)
fig.show()
當日康復前10國家
# Bar chart of the ten countries with the most recovered cases.
import plotly.express as px

by_recovered = ans.sort_values(by='Recovered', ascending=False)
wide_df = by_recovered.head(10)
fig = px.bar(
    wide_df,
    x="Country/Region",
    y=["Recovered"],
    title="Recovered",
    text_auto=True,
)
fig.show()
全球當日確診人數分布
# World map coloured by confirmed cases per country for the latest date.
from pyecharts import options as opts
from pyecharts.charts import Map

# Build the [country, value] pairs for the map series; .tolist() yields plain
# Python scalars rather than NumPy ones.
confirmed_pairs = [
    [country, confirmed]
    for country, confirmed in zip(
        ans['Country/Region'].tolist(), ans['Confirmed'].tolist()
    )
]
c = (
    Map()
    .add("Confirmed", confirmed_pairs, "world")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        # Title typo fixed: "Convid-19" -> "Covid-19".
        title_opts=opts.TitleOpts(title="Covid-19確診人數分布"),
        visualmap_opts=opts.VisualMapOpts(max_=10000000),
    )
    .render_notebook()
)
c
全球當日康復人數分布
# World map coloured by recovered cases per country for the latest date.
from pyecharts import options as opts
from pyecharts.charts import Map

# Build the [country, value] pairs for the map series; .tolist() yields plain
# Python scalars rather than NumPy ones.
recovered_pairs = [
    [country, recovered]
    for country, recovered in zip(
        ans['Country/Region'].tolist(), ans['Recovered'].tolist()
    )
]
d = (
    Map()
    # The series was labelled "Confirmed" although it plots the Recovered
    # column; the label now matches the data shown.
    .add("Recovered", recovered_pairs, "world")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        # Title typo fixed: "Convid-19" -> "Covid-19".
        title_opts=opts.TitleOpts(title="Covid-19康復人數分布"),
        visualmap_opts=opts.VisualMapOpts(max_=10000000),
    )
    .render_notebook()
)
d